Name: Prajakta Ramesh Chavan
Date: 23/02/2024
Domain: Data Science - Oasis Infobyte
Task no. 02: Unemployment analysis with Python
#importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import calendar
import datetime as dt
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from IPython.display import HTML
# Load Unemployment_Rate.csv into `df`.
# NOTE(review): this is an absolute path on one Windows machine; the script
# will not run elsewhere until it points at a local copy of the CSV.
df = pd.read_csv(r'C:\Users\Dell\Desktop\dataset\Unemployment_Rate.csv')
# Bare expression: echoes the frame when executed as a notebook cell.
df
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Region.1 | longitude | latitude | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-01-2020 | M | 5.48 | 16635535 | 41.02 | South | 15.9129 | 79.740 |
| 1 | Andhra Pradesh | 29-02-2020 | M | 5.83 | 16545652 | 40.90 | South | 15.9129 | 79.740 |
| 2 | Andhra Pradesh | 31-03-2020 | M | 5.79 | 15881197 | 39.18 | South | 15.9129 | 79.740 |
| 3 | Andhra Pradesh | 30-04-2020 | M | 20.51 | 11336911 | 33.10 | South | 15.9129 | 79.740 |
| 4 | Andhra Pradesh | 31-05-2020 | M | 17.43 | 12988845 | 36.46 | South | 15.9129 | 79.740 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 262 | West Bengal | 30-06-2020 | M | 7.29 | 30726310 | 40.39 | East | 22.9868 | 87.855 |
| 263 | West Bengal | 31-07-2020 | M | 6.83 | 35372506 | 46.17 | East | 22.9868 | 87.855 |
| 264 | West Bengal | 31-08-2020 | M | 14.87 | 33298644 | 47.48 | East | 22.9868 | 87.855 |
| 265 | West Bengal | 30-09-2020 | M | 9.35 | 35707239 | 47.73 | East | 22.9868 | 87.855 |
| 266 | West Bengal | 31-10-2020 | M | 9.98 | 33962549 | 45.63 | East | 22.9868 | 87.855 |
267 rows × 9 columns
# Load Unemployment_in_India.csv into `india`.
# NOTE(review): this is an absolute path on one Windows machine; the script
# will not run elsewhere until it points at a local copy of the CSV.
# NOTE(review): the recorded info() output for this frame shows 768 entries
# but only 740 non-null values in every column, and the tail rows are all
# NaN, so the file appears to end with blank rows. They are presumably also
# what duplicated().sum() counts (27); consider dropping them before any
# analysis on this frame.
india = pd.read_csv(r'C:\Users\Dell\Desktop\dataset\Unemployment_in_India.csv')
# Bare expression: echoes the frame when executed as a notebook cell.
india
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 764 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 765 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 766 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 767 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
768 rows × 7 columns
india.head(5)
df.head(5)
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Region.1 | longitude | latitude | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-01-2020 | M | 5.48 | 16635535 | 41.02 | South | 15.9129 | 79.74 |
| 1 | Andhra Pradesh | 29-02-2020 | M | 5.83 | 16545652 | 40.90 | South | 15.9129 | 79.74 |
| 2 | Andhra Pradesh | 31-03-2020 | M | 5.79 | 15881197 | 39.18 | South | 15.9129 | 79.74 |
| 3 | Andhra Pradesh | 30-04-2020 | M | 20.51 | 11336911 | 33.10 | South | 15.9129 | 79.74 |
| 4 | Andhra Pradesh | 31-05-2020 | M | 17.43 | 12988845 | 36.46 | South | 15.9129 | 79.74 |
df.shape
(267, 9)
india.shape
(768, 7)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 267 entries, 0 to 266 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 267 non-null object 1 Date 267 non-null object 2 Frequency 267 non-null object 3 Estimated Unemployment Rate (%) 267 non-null float64 4 Estimated Employed 267 non-null int64 5 Estimated Labour Participation Rate (%) 267 non-null float64 6 Region.1 267 non-null object 7 longitude 267 non-null float64 8 latitude 267 non-null float64 dtypes: float64(4), int64(1), object(4) memory usage: 18.9+ KB
india.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 740 non-null object 1 Date 740 non-null object 2 Frequency 740 non-null object 3 Estimated Unemployment Rate (%) 740 non-null float64 4 Estimated Employed 740 non-null float64 5 Estimated Labour Participation Rate (%) 740 non-null float64 6 Area 740 non-null object dtypes: float64(3), object(4) memory usage: 42.1+ KB
df.isnull().sum()
Region 0 Date 0 Frequency 0 Estimated Unemployment Rate (%) 0 Estimated Employed 0 Estimated Labour Participation Rate (%) 0 Region.1 0 longitude 0 latitude 0 dtype: int64
df.describe()
| Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | longitude | latitude | |
|---|---|---|---|---|---|
| count | 267.000000 | 2.670000e+02 | 267.000000 | 267.000000 | 267.000000 |
| mean | 12.236929 | 1.396211e+07 | 41.681573 | 22.826048 | 80.532425 |
| std | 10.803283 | 1.336632e+07 | 7.845419 | 6.270731 | 5.831738 |
| min | 0.500000 | 1.175420e+05 | 16.770000 | 10.850500 | 71.192400 |
| 25% | 4.845000 | 2.838930e+06 | 37.265000 | 18.112400 | 76.085600 |
| 50% | 9.650000 | 9.732417e+06 | 40.390000 | 23.610200 | 79.019300 |
| 75% | 16.755000 | 2.187869e+07 | 44.055000 | 27.278400 | 85.279900 |
| max | 75.850000 | 5.943376e+07 | 69.690000 | 33.778200 | 92.937600 |
india.describe()
| Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | |
|---|---|---|---|
| count | 740.000000 | 7.400000e+02 | 740.000000 |
| mean | 11.787946 | 7.204460e+06 | 42.630122 |
| std | 10.721298 | 8.087988e+06 | 8.111094 |
| min | 0.000000 | 4.942000e+04 | 13.330000 |
| 25% | 4.657500 | 1.190404e+06 | 38.062500 |
| 50% | 8.350000 | 4.744178e+06 | 41.160000 |
| 75% | 15.887500 | 1.127549e+07 | 45.505000 |
| max | 76.740000 | 4.577751e+07 | 72.570000 |
df.duplicated().sum()
0
india.duplicated().sum()
27
# Replace the raw CSV headers positionally. The first column (originally
# 'Region') holds state names and the seventh (originally 'Region.1') holds
# the geographic zone, so they become 'States' and 'Region'; the '(%)'
# suffixes are dropped from the two rate columns.
df.columns = [
    'States',
    'Date',
    'Frequency',
    'Estimated Unemployment Rate',
    'Estimated Employed',
    'Estimated Labour Participation Rate',
    'Region',
    'longitude',
    'latitude',
]
df.columns
Index(['States', 'Date', 'Frequency', 'Estimated Unemployment Rate',
'Estimated Employed', 'Estimated Labour Participation Rate', 'Region',
'longitude', 'latitude'],
dtype='object')
# Replace the raw CSV headers positionally: the first column (originally
# 'Region') holds state names and becomes 'States'; the '(%)' suffixes are
# dropped from the two rate columns.
india.columns = [
    'States',
    'Date',
    'Frequency',
    'Estimated Unemployment Rate',
    'Estimated Employed',
    'Estimated Labour Participation Rate',
    'Area',
]
india.columns
Index(['States', 'Date', 'Frequency', 'Estimated Unemployment Rate',
'Estimated Employed', 'Estimated Labour Participation Rate', 'Area'],
dtype='object')
df.head(2)
| States | Date | Frequency | Estimated Unemployment Rate | Estimated Employed | Estimated Labour Participation Rate | Region | longitude | latitude | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-01-2020 | M | 5.48 | 16635535 | 41.02 | South | 15.9129 | 79.74 |
| 1 | Andhra Pradesh | 29-02-2020 | M | 5.83 | 16545652 | 40.90 | South | 15.9129 | 79.74 |
india.head(2)
| States | Date | Frequency | Estimated Unemployment Rate | Estimated Employed | Estimated Labour Participation Rate | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
# Parse the 'dd-mm-yyyy' strings; dayfirst=True stops day and month from
# being swapped on ambiguous dates.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

# Store the repeated text columns as categoricals.
df['Frequency'] = df['Frequency'].astype('category')
df['Region'] = df['Region'].astype('category')

# Month of each observation as a plain integer, plus its abbreviated name
# (as provided by calendar.month_abbr) for use as a plot/animation label.
df['MonthNumber'] = df['Date'].dt.month.apply(int)
df['MonthName'] = df['MonthNumber'].map(lambda month: calendar.month_abbr[month])
df.describe()
| Date | Estimated Unemployment Rate | Estimated Employed | Estimated Labour Participation Rate | longitude | latitude | MonthNumber | |
|---|---|---|---|---|---|---|---|
| count | 267 | 267.000000 | 2.670000e+02 | 267.000000 | 267.000000 | 267.000000 | 267.000000 |
| mean | 2020-06-16 09:15:30.337078528 | 12.236929 | 1.396211e+07 | 41.681573 | 22.826048 | 80.532425 | 5.535581 |
| min | 2020-01-31 00:00:00 | 0.500000 | 1.175420e+05 | 16.770000 | 10.850500 | 71.192400 | 1.000000 |
| 25% | 2020-03-31 00:00:00 | 4.845000 | 2.838930e+06 | 37.265000 | 18.112400 | 76.085600 | 3.000000 |
| 50% | 2020-06-30 00:00:00 | 9.650000 | 9.732417e+06 | 40.390000 | 23.610200 | 79.019300 | 6.000000 |
| 75% | 2020-08-31 00:00:00 | 16.755000 | 2.187869e+07 | 44.055000 | 27.278400 | 85.279900 | 8.000000 |
| max | 2020-10-31 00:00:00 | 75.850000 | 5.943376e+07 | 69.690000 | 33.778200 | 92.937600 | 10.000000 |
| std | NaN | 10.803283 | 1.336632e+07 | 7.845419 | 6.270731 | 5.831738 | 2.870915 |
round(df[['Estimated Unemployment Rate', 'Estimated Employed', 'Estimated Labour Participation Rate']].describe().T,2)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Estimated Unemployment Rate | 267.0 | 12.24 | 10.80 | 0.50 | 4.84 | 9.65 | 16.76 | 75.85 |
| Estimated Employed | 267.0 | 13962105.72 | 13366318.36 | 117542.00 | 2838930.50 | 9732417.00 | 21878686.00 | 59433759.00 |
| Estimated Labour Participation Rate | 267.0 | 41.68 | 7.85 | 16.77 | 37.26 | 40.39 | 44.06 | 69.69 |
# Mean unemployment rate, employed count and labour participation rate for
# each geographic region, shown rounded to two decimals.
regionStats = df.groupby(['Region'])[
    [
        'Estimated Unemployment Rate',
        'Estimated Employed',
        'Estimated Labour Participation Rate',
    ]
].mean()
regionStats = regionStats.reset_index()
regionStats.round(2)
| Region | Estimated Unemployment Rate | Estimated Employed | Estimated Labour Participation Rate | |
|---|---|---|---|---|
| 0 | East | 13.92 | 19602366.90 | 40.11 |
| 1 | North | 15.89 | 13072487.92 | 38.70 |
| 2 | Northeast | 10.95 | 3617105.53 | 52.06 |
| 3 | South | 10.45 | 14040589.33 | 40.44 |
| 4 | West | 8.24 | 18623512.72 | 41.26 |
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
heatMap = df[
    [
        'Estimated Unemployment Rate',
        'Estimated Employed',
        'Estimated Labour Participation Rate',
        'longitude',
        'latitude',
        'MonthNumber',
    ]
].corr()

plt.figure(figsize=(23, 8))
sns.heatmap(heatMap, annot=True, cmap='magma', fmt='.3f', linewidths=1)
plt.title('heatMap')
plt.show()
# One box per state showing the spread of its unemployment-rate observations.
fig = px.box(df, x='States', y='Estimated Unemployment Rate', color='States',
             title='unemploymentRate', template='plotly')
fig.show()
# Bar plot of the average unemployment rate in each state, ordered from the
# lowest average to the highest.
newDF = (
    df[['Estimated Unemployment Rate', 'States']]
    .groupby('States')
    .mean()
    .reset_index()
    .sort_values('Estimated Unemployment Rate')
)
fig = px.bar(newDF,
             x='States',
             y='Estimated Unemployment Rate',
             color='States',
             # The plotted metric is the unemployment rate; the previous title
             # wrongly read "Employment Rate".
             title='State-wise Average Unemployment Rate')
fig.show()
# Animated bar chart of unemployment rate per region, one frame per month
# name, with bars coloured by state.
fig = px.bar(
    df,
    x='Region',
    y='Estimated Unemployment Rate',
    color='States',
    animation_frame='MonthName',
    title='Region-wise Unemployment Rate',
    height=800,
)
# Set the frame duration on the first button of the first update menu
# (presumably the Play button, duration in milliseconds -- TODO confirm).
firstButton = fig.layout.updatemenus[0].buttons[0]
firstButton.args[1]["frame"]["duration"] = 1500
fig.show()
# Average unemployment rate for every (Region, State) pair.
# 'Region' is a categorical column, so a plain groupby would emit a row for
# every Region x States combination, including pairs that never occur, with
# NaN as their mean. observed=True keeps only the pairs present in the data,
# which is what the sunburst chart built from this frame needs.
unempDF = (
    df.groupby(['Region', 'States'], observed=True)['Estimated Unemployment Rate']
    .mean()
    .reset_index()
)
#printing the new dataframe
unempDF.head(4)
| Region | States | Estimated Unemployment Rate | |
|---|---|---|---|
| 0 | East | Andhra Pradesh | NaN |
| 1 | East | Assam | NaN |
| 2 | East | Bihar | 19.471 |
| 3 | East | Chhattisgarh | NaN |
# Two-level sunburst: regions on the inner ring, their states on the outer
# ring, with sectors sized by the averaged unemployment rate.
fig = px.sunburst(
    unempDF,
    path=['Region', 'States'],
    values='Estimated Unemployment Rate',
    height=650,
    title= 'unemployment rate in each region and state',
)
fig.show()
#!pip install sunburst
Requirement already satisfied: sunburst in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (1.0.0a2) Requirement already satisfied: matplotlib in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from sunburst) (3.7.2) Requirement already satisfied: typing in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from sunburst) (3.7.4.3) Requirement already satisfied: contourpy>=1.0.1 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (1.0.5) Requirement already satisfied: cycler>=0.10 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (1.4.4) Requirement already satisfied: numpy>=1.20 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (1.24.3) Requirement already satisfied: packaging>=20.0 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (23.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (9.4.0) Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from matplotlib->sunburst) (2.8.2) Requirement already satisfied: six>=1.5 in c:\users\dell\anaconda3\annaconda2023\lib\site-packages (from python-dateutil>=2.7->matplotlib->sunburst) (1.16.0)
# Animated bubble map: one bubble per state, sized by unemployment rate and
# coloured by region, with one frame per month name.
# The column named 'longitude' is deliberately passed as `lat` and the one
# named 'latitude' as `lon`: in this dataset the values under 'longitude'
# span roughly 10-34 and those under 'latitude' roughly 71-93, which match
# the latitude/longitude axis ranges set below, i.e. the two headers are
# swapped relative to their contents.
fig = px.scatter_geo(
    df,
    lat='longitude',
    lon='latitude',
    color="Region",
    hover_name="States",
    size="Estimated Unemployment Rate",
    animation_frame="MonthName",
    scope='asia',
    title='Lockdown Impact throughout India',
)
# Set the frame duration on the first button of the first update menu
# (presumably the Play button, duration in milliseconds -- TODO confirm).
firstButton = fig.layout.updatemenus[0].buttons[0]
firstButton.args[1]["frame"]["duration"] = 1200
#updating the geospatial axes ranges and ocean color
fig.update_geos(
    lataxis_range=[5, 35],
    lonaxis_range=[65, 100],
    showocean=True,
    oceancolor="#6dd5ed",
)
fig.show()
# Months 4-7 (inclusive): the window treated here as "after lockdown".
df47 = df[df['MonthNumber'].between(4, 7)]
# Months 1-4 (inclusive): the window treated here as "before lockdown".
# NOTE(review): month 4 belongs to both windows -- confirm this overlap is
# intended.
df14 = df[df['MonthNumber'].between(1, 4)]

# Mean unemployment rate per state within each window.
df47g = df47.groupby('States')['Estimated Unemployment Rate'].mean().reset_index()
df14g = df14.groupby('States')['Estimated Unemployment Rate'].mean().reset_index()

# Combine the two windows by joining on the state name. Assigning one
# frame's column into the other matches rows by position only, so a state
# missing from either window would shift every following state's "before"
# value onto the wrong row. A left merge keeps every state of the "after"
# window and leaves NaN where no "before" data exists.
df47g = df47g.rename(
    columns={'Estimated Unemployment Rate': 'unemploymentRate A/ lockdown'}
).merge(
    df14g.rename(
        columns={'Estimated Unemployment Rate': 'unemploymentRate B/ lockdown'}
    ),
    on='States',
    how='left',
)
#displaying the top results
df47g.head()
| States | unemploymentRate A/ lockdown | unemploymentRate B/ lockdown | |
|---|---|---|---|
| 0 | Andhra Pradesh | 12.3975 | 9.4025 |
| 1 | Assam | 6.2450 | 6.2250 |
| 2 | Bihar | 30.8025 | 20.7425 |
| 3 | Chhattisgarh | 9.6025 | 7.2450 |
| 4 | Delhi | 24.3600 | 17.6975 |
# Percentage change of the unemployment rate from the "before" window to the
# "after" window: (after - before) / before * 100.
# The subtraction must be parenthesised; without it the division binds first
# (before / before == 1) and the result is merely `after - 1`.
df47g['% change in unemployment'] = round(
    (df47g['unemploymentRate A/ lockdown'] - df47g['unemploymentRate B/ lockdown'])
    / df47g['unemploymentRate B/ lockdown']
    * 100,
    2,
)
df47g = df47g.sort_values('% change in unemployment')
fig = px.bar(df47g, x='States', y='% change in unemployment',
             color='% change in unemployment',
             title='% change in Unemployment A/ Lockdown')
# Display the chart here; `fig` is rebound to a different chart further down.
fig.show()
def sort_impact(x):
    """Classify a '% change in unemployment' value into an impact label.

    Buckets (upper bounds inclusive):
        x <= 10       -> 'impactedState'
        10 < x <= 20  -> 'hardImpactedState'
        20 < x <= 30  -> 'harderImpactedState'
        x > 30        -> 'hardestImpactedState'

    Negative values (a falling rate) land in the lowest bucket, as before.
    A NaN input compares false against every bound and is returned unchanged
    so missing data is not given a misleading label.
    """
    if x <= 10:
        return 'impactedState'
    if x <= 20:
        return 'hardImpactedState'
    if x <= 30:
        return 'harderImpactedState'
    if x > 30:
        # The top bucket is open-ended: values above 40 used to fall through
        # and come back as raw numbers, mixing floats with the string labels.
        return 'hardestImpactedState'
    return x
# Label every state with its impact bucket, then draw a horizontal bar chart
# of the % change coloured by that bucket.
df47g['impactStatus'] = df47g['% change in unemployment'].apply(sort_impact)
fig = px.bar(
    df47g,
    x='% change in unemployment',
    y='States',
    color='impactStatus',
    title='Lockdown Impact on Employment in India',
)
fig.show()